import re


# Load the whole HTML document into memory as text.
with open('test.html', encoding='utf-8') as html_file:
    test_html = html_file.read()

# Dates written like "2024年3月15日", limited to the years 2023, 2024 and 2025.
date_pattern = re.compile(r'202[345]年\d+月\d+日')
dates = date_pattern.findall(test_html)

# Show the distinct dates as one set literal (set ordering is not guaranteed).
unique_dates = set(dates)
print(unique_dates)


# Capture the value of every double-quoted href attribute that starts with
# "http"; the non-greedy group stops at the first closing quote.
link_pattern = re.compile(r'href="(http.*?)"')
links = link_pattern.findall(test_html)

# Print each distinct link on its own line (again in arbitrary set order).
unique_links = set(links)
for link in unique_links:
    print(link)
